library(readxl)
library(tidytable)
##
## Attaching package: 'tidytable'
## The following objects are masked from 'package:stats':
##
## dt, filter, lag
## The following object is masked from 'package:base':
##
## %in%
conflicted::conflict_prefer_all('tidytable', quiet = TRUE)
# Directory that holds all raw input files used below.
data_folder <- "data"
# load enable data
# First two spreadsheet rows are skipped (header cruft); several spellings
# of "missing" are normalized to NA on read.
enable_data <- read_xlsx(file.path(data_folder, "enable_Datensatz_RMR._Erwachsene 1.xlsx"), skip = 2, na=c('NA', 'N/A', ''))
# fix compound values
# Metabolite columns that arrive as text and may contain below-detection
# markers ("<0" / "< 0"); presumably short-chain fatty acids and lactate —
# TODO confirm against the data dictionary.
compounds <- c(
"acetic acid a", "butyric acid a", "propionic acid a",
"2-Methylbutyrate a", "hexanoic acid c", "Isobutyrate a",
"Isovalerate a", "pentanoic acid a",
"4-Methylvaleric acid a", "Lactic acid a"
)
# For every character column, replace the (German) decimal comma with a dot
# so the values can later be parsed as numbers.
# NOTE(review): sub() replaces only the FIRST comma per value — fine if each
# value holds at most one decimal comma; confirm no thousands separators.
for (column_name in names(enable_data)) {
if (is.character(enable_data[[column_name]])) {
enable_data[[column_name]] <-
sub(",", ".", enable_data[[column_name]], fixed = TRUE)
# for compounds, replace "<0" with 0 to be able to convert to numeric
if (column_name %in% compounds) {
# replace "<0" with 0
enable_data[[column_name]] <-
ifelse(enable_data[[column_name]] == "<0" |
enable_data[[column_name]] == "< 0", 0, enable_data[[column_name]])
}
}
}
# write to csv and read again using fread to get the correct data types
# NOTE(review): the file is named .tsv but fwrite's default separator is a
# comma; fread auto-detects the separator, so the round-trip still works —
# consider sep = '\t' or a .csv name for consistency.
data_csv <- file.path(data_folder, 'enable_data.tsv')
fwrite(enable_data, file = data_csv)
enable_data <- data.table::fread(data_csv, stringsAsFactors = TRUE, na=c('NA', 'N/A', ''))
# add 1 to the weight of the nuernberg site
# (presumably a scale calibration offset for that site — TODO confirm)
enable_data[Site == "nuernber", `GEWICHt_SECA, kg` := `GEWICHt_SECA, kg` + 1]
# normalize the raw site spellings to display names ("nuernber" is the
# truncated value as stored in the spreadsheet)
enable_data[Site == "nuernber", Site := "Nuremberg"]
enable_data[Site == "freising", Site := "Freising"]
# remove unnecessary column:
# `Probanden-ID` duplicates Label; drop it only after verifying equality.
if(all(enable_data[, `Probanden-ID` == Label]))
invisible(enable_data[, `Probanden-ID` := NULL])
# load microbiome mapping
microbiome_mapping <- data.table::setDT(read_xlsx(file.path(data_folder, "mapping_file 1.xlsx")))
# sample_clean = second "."-separated token of #SampleID
microbiome_mapping[, sample_clean := data.table::tstrsplit(".", x = `#SampleID`, fixed = TRUE, keep = 2)]
# check uniqueness
stopifnot(!any(microbiome_mapping[, duplicated(sample_clean)]))
# load microbiome data
microbiome_data <- fread(file.path(data_folder, "tax.summary.all 1.tab"))
# strip the sample column names down to the same second "."-separated token
stripped_names <- unlist(data.table::tstrsplit(".", x = names(microbiome_data), fixed = TRUE, keep = 2))
# check uniqueness
stopifnot(!any(duplicated(stripped_names)))
# clean names
names(microbiome_data) <- stripped_names
# get taxa names
taxa_names <- microbiome_data[, V1]
# transpose data: rows become samples (sample_clean), columns become taxa
microbiome_data <- microbiome_data |> data.table::transpose(keep.names = "sample_clean", make.names = "V1")
# merge microbiome data with mapping
# update-join: pull Label (from Code) and Cohort out of the mapping table
microbiome_data[microbiome_mapping, on=.(sample_clean), c("Label", "Cohort"):=.(Code, as.character(Cohort))]
# merge with enable data
# update-join on Label: attach all taxa columns plus Cohort to enable_data
invisible(enable_data[microbiome_data, on=.(Label), (c(taxa_names, "Cohort")) := mget(c(taxa_names, "Cohort"))])
# create field for first letter of label, i.e., categorical Age and Site
invisible(enable_data[, Label_group:=substr(Label, 1, 1)])
# diversity columns
diversity_columns <- c("Shannon.effective", "Simpson.effective")
# load diversity data
# V1 holds the sample label; keep only the two effective-diversity indices
diversity_data <- fread('data/Final table.tab', stringsAsFactors=TRUE) |> data.table::setnames(old='V1', new="Label") |> select(Label, all_of(diversity_columns))
# update-join the diversity indices into enable_data by Label
invisible(enable_data[diversity_data, on=.(Label), (c(diversity_columns)) := mget(diversity_columns)])
# Map raw (mostly German) spreadsheet column names to clean English names
# used throughout the rest of the analysis and in figures.
name_mapping <- c(
"SEX" = "Sex",
"Alter, Jahre" = "Age",
"mean_temp" = "Mean Outdoor Temperature",
"Datum_V1" = "Date",
"GROESSE, cm" = "Height",
"GEWICHt_SECA, kg" = "Weight",
"FETTMASSE_SECA, kg" = "FM",
"FFM_SECA,kg" = "FFM",
"VISZFETT,l" = "Visceral Fat",
"TAILLENUMFANG,cm" = "Waist",
"HUEFTUMFANG, cm" = "Hip",
"Körpertemperatur, °C" = "Body Temperature",
"RMR.KJ" = "RMR",
"LEUKOZYTEN/nl" = "Leukocytes",
"hsCRP, mg/dl" = "hsCRP",
"INSULIN, µU/ml" = "Insulin",
"TSH, µU/ml" = "TSH",
"FT3, pg/ml" = "fT3",
"Hämoglobin, g/dl" = "Hemoglobin",
"Hämatokrit, %" = "Hematocrit",
"MCHC, g/dl" = "MCHC",
"GOTAST, U/l" = "GOT AST",
"GPTALT, U/l" = "GPT ALT",
"GGT, U/l" = "GGT",
"LDH, U/l" = "LDH",
"Blutzucker, mg/dl" = "Glucose",
"Cholesterin, , mg/dl" = "Cholesterol",
"Triglyzeride, mg/dl" = "Triglycerides",
"HDLCHOL, mg/dl" = "HDL",
"LDLCHOL, mg/dl" = "LDL",
"KREATIN, mg/dl" = "Creatinine",
"GFRCKDE, mg/min" = "GFR",
"HARNSTOFF, mg/dl" = "Urea",
"Harnsäure, mg/dl" = "Uric Acid",
"EISEN,µg/dl" = "Iron",
"FERRITIN,ng/ml" = "Ferritin",
"Blutdruck_systolisch, mmHg" = "BP Systolic",
"Blutdruck_diastolisch, mmHg" = "BP Diastolic",
"Herzfrequenz, Schläge pro Minute" = "Heart Rate"
)
# apply the renaming in place
data.table::setnames(enable_data, old = names(name_mapping), new = name_mapping)
# groups for taxa
# Taxa columns are prefixed with a rank letter before "__" (e.g. "f__...");
# translate that prefix into the rank name used for column grouping below.
taxa_group <-
c(
k = 'kingdom',
p = 'phylum',
c = 'class',
o = 'order',
f = 'family',
g = 'genus'
)[data.table::tstrsplit(taxa_names,
split = '__',
keep = 1,
fixed = TRUE)[[1]]]
# match the columns to certain groups
# Sources: the hand-written Label_group entry, the curated tsv, one group
# per taxonomic rank, and one group for the diversity indices.
column_groups <- rbind(data.table::data.table(column = 'Label_group', group = "General information"),
fread(file.path(data_folder, 'column_groups.tsv')),
data.table::data.table(column = taxa_names, group = paste("microbiome", taxa_group)),
data.table::data.table(column = diversity_columns, group = paste("microbiome diversity")))
column_groups[, group:=as.factor(group)]
# keep column_groups in sync with the renamed enable_data columns
column_groups[column %in% names(name_mapping), column := name_mapping[column]]
# set general response variable
response_variable <- "RMR"
# Conversion factor kcal to kilojoule
conversion_factor <- 4.184
# define grouping column
grouping_column <- "Label_group"
# get other labelling columns
# columns that identify samples but are never used as predictors
other_labelling_columns <- c("Label", "Site", "Cohort", "Date")
# set the predictor of the established model
# (column names used by the literature RMR equations further below)
age_predictor <- "Age"
sex_predictor <- "Sex"
weight_predictor <- "Weight"
height_predictor <- "Height"
basic_predictors <- c(age_predictor, sex_predictor, weight_predictor, height_predictor, "FM", "FFM")
# define microbial predictors (family level)
microbiome_predictors <- column_groups[group == 'microbiome family', column]
# # other microbial columns
# other_microbial_columns <- setdiff(taxa_names, microbial_predictors)
temperature_predictor <- "Mean Outdoor Temperature"
# define the general predictors
# everything that is neither response, grouping, label nor a raw taxon column
general_predictors <- setdiff(names(enable_data), c(response_variable, grouping_column, other_labelling_columns, taxa_names))
# make sure all predictors are in the data
stopifnot(all(c(response_variable, grouping_column, other_labelling_columns, taxa_names, basic_predictors, microbiome_predictors, general_predictors) %in% names(enable_data)))
# Load the external Kiel validation cohort and harmonize it to the enable
# data schema (same column names, factor levels and units).
kiel_data <- data.table::setDT(read_xlsx(file.path(data_folder, "REE Datenbank TUM_Januar 2025tf QM.xlsx")))
## New names:
## • `` -> `...12`
## • `?` -> `?...14`
## • `?` -> `?...15`
## • `?` -> `?...16`
## • `?` -> `?...17`
# make new SampleID from ID and zusätzliche Codierung
kiel_data[, Site := "Kiel"]
kiel_data[, Label := paste0(ID, "_", `zusätzliche Codierung`)]
# recode numeric sex (1 = male) onto the same factor levels as enable_data
kiel_data[, Sex := factor(ifelse(`Geschlecht` == 1, "männlich", "weiblich"), levels = levels(enable_data$Sex))]
kiel_data[, Age := Alter]
# height arrives in meters; convert to cm to match enable_data
kiel_data[, Height := `Größe (m)` * 100]
kiel_data[, FM := FM_ADP_kg]
kiel_data[, Weight := `Gewicht (kg)`]
# fat-free mass = total weight minus fat mass
kiel_data[, FFM := Weight - FM]
kiel_data[, Date := `Datum`]
kiel_data[, `Mean Outdoor Temperature` := `daily mean T`]
# REE is recorded in kcal; convert to kJ
kiel_data[, RMR := `REE (kcal)` * conversion_factor]
# predictors available in both datasets (minus the labelling columns)
kiel_predictors <- setdiff(intersect(names(enable_data), names(kiel_data)),
other_labelling_columns)
# remove response from kiel predictors
kiel_predictors <- setdiff(kiel_predictors, response_variable)
# add enable-only columns as NA, drop Kiel-only columns, reorder to match
kiel_data[, (names(enable_data)[!names(enable_data) %in% names(kiel_data)]) := NA]
kiel_only <- setdiff(names(kiel_data), names(enable_data))
kiel_data[, (kiel_only) := NULL]
kiel_data <- kiel_data[, names(enable_data), with=FALSE]
all_data <- rbind(enable_data, kiel_data, use.names=TRUE)
all_data[, Site:=factor(Site, levels = c('Freising', 'Nuremberg', 'Kiel'))]
suppressPackageStartupMessages(library(tidymodels))
tidymodels_prefer()
# number of NA responses
message(paste("Number of NAs in response: ", all_data |> filter(is.na(get(response_variable))) |> nrow()))
## Number of NAs in response: 8
# drop samples without a measured response
all_data <- all_data |> subset(!is.na(get(response_variable)))
# per-site summary table: n plus "mean ± sd" for age, body composition, RMR
data.table::transpose(all_data[, c(n = .N, lapply(.SD, function(x) paste(round(mean(x, na.rm=TRUE), 2), round(sd(x, na.rm=TRUE), 2), sep = " ± "))), by=Site, .SDcols = c(age_predictor, "FM", "FFM", response_variable)], keep.names = 'Column', make.names = 'Site')
## Column Freising Nuremberg Kiel
## <char> <char> <char> <char>
## 1: N 351 103 361
## 2: Age 48.2 ± 18.91 78.28 ± 2.86 43.18 ± 15.19
## 3: FM 23.21 ± 10.15 28.48 ± 7.78 32.4 ± 14.85
## 4: FFM 53.82 ± 11.45 47.05 ± 10.63 52.38 ± 13.36
## 5: RMR 6868.22 ± 1218.15 6399.38 ± 1198.19 6886.54 ± 1308.15
# set seed for reproducibility
set.seed(1)
# split the data by Site to have a test sets
# group_initial_split keeps whole sites together; with this seed Freising
# becomes the training set and Nuremberg the test set (asserted below)
enable_split <- group_initial_split(all_data[Site %in% c('Freising', 'Nuremberg')], group = "Site")
train_data <- enable_split |> training()
test_data <- enable_split |> testing()
# ensure that Site == "freising" in the whole train_data and nuremberg_data
stopifnot(all(train_data$Site == 'Freising'))
stopifnot(all(test_data$Site == 'Nuremberg'))
# Number of samples that have at least one NA value
message(paste("Number of NAs in any variable: ", train_data |>
filter_all(any_vars(is.na(.))) |>
nrow()))
## Number of NAs in any variable: 42
# set number of folds and repeats
nfolds <- 5
nrepeats <- 10
set.seed(1102)
# repeated k-fold CV on the training site only
repeated_cv_split <- rsample::vfold_cv(train_data, v = nfolds, repeats = nrepeats)
# create recipes
# Base recipe: roles are declared explicitly (taxa stay in the data but are
# inert unless promoted to predictors); family-level taxa are log(x + 1)
# transformed, numeric predictors mean-imputed, zero-variance columns
# dropped and factors dummy-coded.
general_recipe <-
recipe(train_data) |>
update_role(all_of(taxa_names), new_role = "microbiome") |>
update_role(all_of(grouping_column), new_role = "splitting indicator") |>
update_role(all_of(other_labelling_columns), new_role = "labels") |>
update_role(all_of(response_variable), new_role = "outcome") |>
update_role(all_of(microbiome_predictors), new_role = "predictor") |>
update_role(all_of(general_predictors), new_role = "predictor") |>
step_log(all_of(microbiome_predictors), offset = 1) |>
step_impute_mean(all_numeric_predictors()) |>
step_zv(all_predictors()) |>
step_dummy(all_nominal_predictors())
# variant with normalized numeric predictors (needed e.g. for the nnet)
normalized_recipe <- general_recipe |>
step_normalize(all_numeric_predictors())
# "accessible" predictors: measurable in both cohorts, minus temperature
accessible_predictors <- kiel_predictors[kiel_predictors != temperature_predictor]
# restrict the predictor role to the accessible set; previous predictors are
# demoted to the inert "old_predictor" role
general_accessible_pred_recipe <- general_recipe |>
update_role(all_of(general_predictors), new_role = "old_predictor") |>
update_role(all_of(microbiome_predictors), new_role = "old_predictor") |>
update_role(all_of(accessible_predictors), new_role = "predictor")
normalized_accessible_pred_recipe <- general_accessible_pred_recipe |>
step_normalize(all_numeric_predictors())
# recipe list
recipe_list <- list(general = general_recipe,
normalized = normalized_recipe,
generalAccessPred = general_accessible_pred_recipe,
normalizedAccessPred = normalized_accessible_pred_recipe)
# evaluation metrics; the FIRST one (rmse) is used for ranking and selection
my_metrics <- metric_set(rmse, ccc, rsq, rsq_trad, mae)
main_metric <- names(attr(my_metrics, "metrics"))[1]
# linear model
lm_model <-
linear_reg() |>
set_engine("lm")
# lasso model (mixture = 1 -> pure L1 penalty; penalty tuned)
lasso_model <-
linear_reg(penalty = tune(), mixture = 1) |>
set_engine("glmnet", standardize = TRUE)
# random forest
rf_model <-
rand_forest(trees = tune(), min_n = tune(), mtry = tune()) |>
set_engine("ranger", importance = 'permutation') |>
set_mode("regression")
# neural network
# single-hidden-layer perceptron; MaxNWts raised so wide inputs still fit
nnet_model <-
mlp(hidden_units = tune(), penalty = tune(), epochs = tune()) |>
set_engine("nnet", MaxNWts = 2600) |>
set_mode("regression")
# model list
model_list <- list(linear = lm_model,
lasso = lasso_model,
RF = rf_model,
nnet = nnet_model)
# set the same parameters for all penalties
# (shared log10 penalty range for lasso and nnet)
penalty_range <- penalty(range=c(-10, 3))
# set the parameters for the lasso
lasso_params <-
lasso_model |>
extract_parameter_set_dials() |>
update(penalty = penalty_range)
# set the parameters for the RF
# (the mtry upper bound is filled in per workflow further below)
rf_params <-
rf_model |>
extract_parameter_set_dials()
# set the parameters for the NN
nnet_params <-
nnet_model |>
extract_parameter_set_dials() |>
update(hidden_units = hidden_units(c(1, 27))) |>
update(penalty = penalty_range)
params_list <- list(lasso = lasso_params,
RF = rf_params,
nnet = nnet_params)
grid_size <- 100
# set the workflows
# cross = FALSE pairs recipes and models elementwise: linear/lasso/RF use
# the un-normalized recipe, nnet the normalized one; then the same four
# again on the accessible-predictor recipes
workflows <- workflow_set(
preproc = c(rep(recipe_list['general'], 3),
rep(recipe_list['normalized'], 1),
rep(recipe_list['generalAccessPred'], 3),
rep(recipe_list['normalizedAccessPred'], 1)),
models = rep(model_list, 2),
cross = FALSE)
# Iterate over the wflow_ids, add parameters and generate the mtry parameter
# dynamically: for random forests the mtry upper bound is the number of
# columns with role "predictor" in the workflow's recipe (the recipe name is
# the wflow_id prefix before the first "_").
# NOTE(review): for "linear" workflows params_list has no entry, so NULL is
# attached as param_info — presumably a harmless no-op; confirm.
for (this_wflow_id in workflows$wflow_id){
model_name <- data.table::tstrsplit(this_wflow_id, '_', fixed = TRUE, keep = 2)[[1]]
# set the parameters for the RF in question
params <- params_list[[model_name]]
if (model_name == "RF"){
params <- params |>
update(mtry = mtry(c(
1, sum(recipe_list[[sub("_.*", "", this_wflow_id)]]$var_info$role == "predictor")
)))
}
workflows <- workflows |>
option_add(param_info = params, id = this_wflow_id)
}
# Register a parallel backend for tuning: one worker per fold x repeat,
# capped so at least two cores stay free for the system.
# Use library() instead of require(): require() only returns FALSE when the
# package is missing, which would let the script continue unparallelized
# (or fail later); library() errors immediately at the point of the problem.
library(doMC)
## Loading required package: doMC
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
doMC::registerDoMC(cores = min(nrepeats*nfolds, parallel::detectCores() - 2))
# Cache the (expensive) grid search: load from RDS when present, otherwise
# tune every workflow over the repeated CV folds and save the result.
repeated_cv_result_file <- "repeated_cv_results.rds"
if (file.exists(repeated_cv_result_file)) {
print("loading grid results")
repeated_cv_result <- readRDS(repeated_cv_result_file)
} else {
print("creating grid results")
repeated_cv_result <- workflows |>
workflow_map(
fn = "tune_grid",
seed = 1503,
resamples = repeated_cv_split,
grid = grid_size,
verbose = TRUE,
metrics = my_metrics,
control = control_grid(
save_pred = TRUE
)
)
saveRDS(repeated_cv_result, repeated_cv_result_file)
}
## [1] "loading grid results"
library(ggpubr)
library(ggplot2)
ggplot2::theme_set(ggplot2::theme_bw() + theme(strip.background = element_rect(fill = "white")))
fig_dir <- "figures"
if(!dir.exists(fig_dir)) dir.create(fig_dir, recursive = TRUE)
autoplot(repeated_cv_result, type="wflow_id")
wflow_ids <- repeated_cv_result$wflow_id
# the plain linear models were not tuned, so there is no grid to select from
wflow_ids_wo_linear <- wflow_ids[!endsWith(wflow_ids, "_linear")]
# For every tuned workflow pick the most parsimonious configuration within
# one standard error of the best; ties are broken towards simpler models
# (higher penalty, fewer trees/epochs/hidden units, larger min_n).
final_models <- sapply(wflow_ids_wo_linear, function(this_wflow_id) {
wflow_results <- repeated_cv_result |>
extract_workflow_set_result(this_wflow_id)
# print(autoplot(wflow_results) + labs(title = this_wflow_id))
specs <- workflows |> extract_spec_parsnip(this_wflow_id)
tuning_params <- specs |> extract_parameter_set_dials() |> pull(name)
if (inherits(specs, "linear_reg")) {
if (all(c("penalty", "mixture") %in% tuning_params))
best_model <- wflow_results |> select_by_one_std_err(metric = main_metric, desc(penalty), mixture)
else if ("penalty" %in% tuning_params) # Lasso
best_model <- wflow_results |> select_by_one_std_err(metric = main_metric, desc(penalty))
else if ("mixture" %in% tuning_params)
best_model <- wflow_results |> select_by_one_std_err(metric = main_metric, mixture)
} else if (inherits(specs, "rand_forest")) {
best_model <- wflow_results |> select_by_one_std_err(metric = main_metric, trees, mtry, desc(min_n))
} else if (inherits(specs, "mlp")) {
best_model <- wflow_results |> select_by_one_std_err(metric = main_metric, epochs, hidden_units, desc(penalty))
} else {
stop("Unknown model")
}
best_model
}, simplify = FALSE)
# configurations chosen by the one-std-err rule, keyed by workflow
one_std_err_results <-
data.table::rbindlist(
final_models,
idcol = 'wflow_id',
fill = TRUE
) |>
select(wflow_id, .config)
# best configuration per workflow (plain ranking, no parsimony rule)
best_results <- repeated_cv_result |>
rank_results(rank_metric = main_metric, select_best = TRUE) |>
select(wflow_id, .config) |>
unique()
results_to_keep <- rbind(one_std_err_results, best_results)
# keep only the selected configurations in the ranked CV table
repeated_cv_result_dt <- repeated_cv_result |> rank_results(rank_metric = main_metric) |> semi_join(results_to_keep)
repeated_cv_result_dt[, rank := factor(rank, levels = sort(unique(rank)))]
ggplot(repeated_cv_result_dt, aes(x = rank, y = mean, color = wflow_id)) +
geom_point() +
geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), width = .5) +
facet_wrap(~.metric, scales = 'free')
# finalize each tuned workflow with its selected parameters and evaluate it
# once on the held-out test site (last_fit on the Freising/Nuremberg split)
final_fits <- sapply(names(final_models), function(this_wflow_id)
repeated_cv_result |>
extract_workflow(this_wflow_id) |>
finalize_workflow(final_models[[this_wflow_id]]) |>
last_fit(split = enable_split, metrics=my_metrics),
simplify = FALSE)
# the untuned linear workflows are fit directly on the same split
linear_models <- sapply(wflow_ids[!wflow_ids %in% wflow_ids_wo_linear], function(this_wflow_id){
this_recipe_name <- sub(pattern = '_.*$', replacement = '', this_wflow_id)
workflow() |>
add_recipe(recipe_list[[this_recipe_name]]) |>
add_model(lm_model) |>
last_fit(split = enable_split, metrics = my_metrics)
}, simplify = FALSE)
fits <- c(final_fits, linear_models)
# for the comparison keep the one-std-err picks plus the best linear models
repeated_cv_result_dt <- repeated_cv_result_dt |> semi_join(one_std_err_results |> rbind(best_results |> subset(endsWith(wflow_id, "linear"))))
# overall best workflow by mean of the main metric (rmse: lower is better)
main_model <- repeated_cv_result_dt[.metric == main_metric][which.min(mean), wflow_id]
library(glmnet)
# Fixed color per feature so the same feature is colored identically across
# all subsequent figures.
feature_colors <- c(
"FFM" = "#009999", # Teal (greenish for FFM_SECA)
"Weight" = "#B2DF8A", # Light Green (for GEWICHt_SECA)
"Mean Outdoor Temperature" = "#000000", # Black (for mean_temp)
"GFR" = "#ff6db6", # Pink (blood measure GFRCKDE)
"fT3" = "#b66dff", # Purple (blood measure fT3)
"MCHC" = "#490092", # Dark Purple (blood measure MCHC)
"Unexplained" = "#db6d00", # Orange (for Unexplained)
"Intra-Individual Variation" = "#FDB462" # Light Orange (for Individual Variation)
)
# Lasso selection stability: for the enable and the accessible feature sets,
# run cv.glmnet once per CV repeat (reusing the exact fold assignment from
# `repeated_cv_split`) and record which features receive non-zero
# coefficients at lambda.1se.
stability_dt <- data.table::rbindlist(sapply(recipe_list[c("general", "generalAccessPred")],
  FUN = function(this_recipe){
    # prepare x (predictor matrix) and y (response vector) from the recipe
    prepped_training_data <- this_recipe |> step_select(c(all_predictors(), all_outcomes())) |> prep() |> bake(NULL)
    x <- prepped_training_data |> select(-any_of(response_variable)) |> as.matrix()
    y <- prepped_training_data |> pull(response_variable)
    # get the repeats from our repeated split
    repeat_ids <- repeated_cv_split$id |> unique()
    # one cross-validated lasso per CV repeat
    cvfits <-
      lapply(repeat_ids, function(repeat_id) {
        # Filter the splits for the specified repeat
        repeat_splits <- repeated_cv_split |> subset(id == repeat_id)
        # Initialize a fold ID vector (length = number of rows in the data)
        fold_ids <- integer(prepped_training_data |> nrow())
        # Assign fold numbers to each observation based on assessment sets
        for (i in seq_along(repeat_splits$splits)) {
          fold <- repeat_splits$splits[[i]]
          assessment_indices <- tidy(fold) |> subset(Data == "Assessment") |> pull(Row)
          fold_ids[assessment_indices] <- i # Assign fold number `i` to assessment indices
        }
        # every observation must belong to exactly one assessment fold
        stopifnot(all(fold_ids > 0))
        lasso <-
          cv.glmnet(
            x = x,
            y = y,
            alpha = 1,
            family = "gaussian",
            foldid = fold_ids,
            standardize = TRUE
          )
        lasso
      })
    names(cvfits) <- repeat_ids
    nonzero_coefs <- lapply(cvfits, function(cvfit) {
      # features with non-zero coefficients at the 1-SE lambda
      coefs <- coef(cvfit, s = "lambda.1se")
      nonzero_coefs <- coefs[coefs[, 1] != 0, ]
      # BUGFIX: the sort() result was previously discarded (its value was
      # never assigned), so the returned names stayed in matrix order.
      # Order does not affect the selection counts plotted below.
      nonzero_coefs <- sort(nonzero_coefs, decreasing = TRUE)
      names(nonzero_coefs)
    })
    names(nonzero_coefs) <- repeat_ids
    # long format: one row per (repeat, selected feature)
    transposed_df <- stack(nonzero_coefs)
    # remove intercept from values
    transposed_df <- data.table::setDT(transposed_df) |> subset(values != "(Intercept)")
    transposed_df
  }, simplify = FALSE), idcol = 'recipe')
# display labels for the two feature sets
stability_dt[recipe == "general", Features := "Enable Features"]
stability_dt[recipe == "generalAccessPred", Features := "Accessible Features"]
stability_dt[, Features := factor(Features, levels = c("Enable Features", "Accessible Features"))]
# bar chart: how often each feature was selected across the CV repeats
ggplot(stability_dt, aes(x = reorder(values, ind, function(x) -length(x)), fill = values)) +
  geom_bar() +
  scale_fill_manual(values = feature_colors) +
  facet_grid(Features ~ .) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = 'none') +
  labs(title = "Lasso Feature Robustness", x = "Feature", y = "Number of Times Selected")
ggsave(file.path(fig_dir, "lasso_feature_robustness_both.pdf"), width = 6, height = 5)
# Compute the relative (perceptual) luminance of one or more hex colors.
# Returns a numeric vector in [0, 1], one value per input color.
calculate_luminance <- function(hex_color) {
  channels <- col2rgb(hex_color) / 255
  # ITU-R BT.709 luma coefficients for the R, G and B channels
  0.2126 * channels[1, ] + 0.7152 * channels[2, ] + 0.0722 * channels[3, ]
}
# Pick a readable text color for each feature color: light backgrounds
# (relative luminance > 0.5) get black text, dark backgrounds white text.
text_colors <- vapply(feature_colors, function(bg) {
  if (calculate_luminance(bg) > 0.5) "black" else "white"
}, character(1))
# Non-zero lasso coefficients of the final test-split fits, ordered by
# decreasing absolute coefficient.
selected_coefs <- sapply(names(fits)[endsWith(names(fits), 'lasso')], function(wflow_id){
selected_coefs <- fits[[wflow_id]] |>
extract_fit_parsnip() |> tidy() |> subset(estimate != 0) |>
mutate(abs_estimate = abs(estimate)) |> arrange(desc(abs_estimate)) |> select(-abs_estimate)
}, simplify = FALSE)
# human-readable model equations (intercept term label stripped)
print(sub(pattern = "x (Intercept) ", replacement = "", x = selected_coefs$general_lasso[, paste(round(estimate, 1), term, sep = " x ", collapse = " + ")], fixed = TRUE))
## [1] "1520.2 + 92.3 x fT3 + 67.5 x FFM + 17.2 x MCHC + -14 x Mean Outdoor Temperature + 9.7 x Weight + 2.4 x GFR"
print(sub(pattern = "x (Intercept) ", replacement = "", x = selected_coefs$generalAccessPred_lasso[, paste(round(estimate, 1), term, sep = " x ", collapse = " + ")], fixed = TRUE))
## [1] "2369.6 + 69.2 x FFM + 13.8 x Weight + -6 x Age"
selected_coefs
## $general_lasso
## # A tidytable: 7 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) 1520. 129.
## 2 fT3 92.3 129.
## 3 FFM 67.5 129.
## 4 MCHC 17.2 129.
## 5 Mean Outdoor Temperature -14.0 129.
## 6 Weight 9.70 129.
## 7 GFR 2.36 129.
##
## $generalAccessPred_lasso
## # A tidytable: 4 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) 2370. 60.5
## 2 FFM 69.2 60.5
## 3 Weight 13.8 60.5
## 4 Age -6.03 60.5
# features selected by the full-enable lasso, and the subset of them that is
# NOT available in the accessible (Kiel) feature set
selected_features <- selected_coefs$general_lasso |> subset(term != '(Intercept)') |> pull(term)
inaccessible_features <- selected_features[!selected_features %in% accessible_predictors]
# now let's make another recipe using only those features
# Follow-up recipes built around the lasso-selected feature set.
selected_recipe <- general_recipe |>
update_role(all_of(general_predictors), new_role = "old_predictor") |>
update_role(all_of(microbiome_predictors), new_role = "old_predictor") |>
update_role(all_of(selected_features), new_role = "predictor")
# add second-degree polynomial terms
quadratic_recipe <- selected_recipe |>
step_poly(all_numeric_predictors(), degree = 2)
# add all pairwise interactions
interaction_recipe <- selected_recipe |>
step_interact(~ all_predictors():all_predictors())
# accessible subset of the selected features, and the same plus temperature
selectedAccessible_recipe <- selected_recipe |>
update_role(all_of(inaccessible_features), new_role = "old_predictor")
selectedWTempAccessible_recipe <- selectedAccessible_recipe |>
update_role(all_of(temperature_predictor), new_role = "predictor")
new_recipes <- list(selected = selected_recipe,
quadratic = quadratic_recipe,
interaction = interaction_recipe,
selectedAccess = selectedAccessible_recipe,
selectedWTempAccess = selectedWTempAccessible_recipe
)
# evaluate a plain linear model on each new recipe with the same repeated CV
new_recipe_repeated_cv_results <- workflow_set(
preproc = new_recipes,
models = list(linear = lm_model)) |>
workflow_map(
fn = "fit_resamples",
seed = 1503,
resamples = repeated_cv_split,
verbose = TRUE,
metrics = my_metrics
) |> rank_results()
## i 1 of 5 resampling: selected_linear
## ✔ 1 of 5 resampling: selected_linear (2.5s)
## i 2 of 5 resampling: quadratic_linear
## ✔ 2 of 5 resampling: quadratic_linear (2.6s)
## i 3 of 5 resampling: interaction_linear
## ✔ 3 of 5 resampling: interaction_linear (2.6s)
## i 4 of 5 resampling: selectedAccess_linear
## ✔ 4 of 5 resampling: selectedAccess_linear (2.6s)
## i 5 of 5 resampling: selectedWTempAccess_linear
## ✔ 5 of 5 resampling: selectedWTempAccess_linear (2.6s)
# ggplot(new_recipe_repeated_cv_results, aes(x = rank, y = mean, color = wflow_id)) +
# geom_point() +
# geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), width = .5) +
# facet_wrap(~.metric, scales = 'free')
repeated_cv_result_dt <- rbind(repeated_cv_result_dt, new_recipe_repeated_cv_results)
# rerank the results based on the main metric
repeated_cv_result_dt[, wflow_id2 := factor(wflow_id, levels = repeated_cv_result_dt[.metric == main_metric][order(mean), wflow_id])]
# ggplot(repeated_cv_result_dt, aes(x = wflow_id2, y = mean, color = model)) +
# geom_point() +
# geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), width = .5) +
# facet_wrap(~.metric, scales = 'free') +
# # rotate x-axis labels 90 degree
# theme(axis.text.x = element_text(angle = 45, hjust = 1))
recipe_list <- c(recipe_list, new_recipes)
# fit the new linear workflows on the train/test split as well
new_models <- sapply(new_recipes,
function(this_recipe){
workflow() |>
add_recipe(this_recipe) |>
add_model(lm_model) |>
last_fit(split = enable_split, metrics = my_metrics)
}, simplify = FALSE)
names(new_models) <- paste(names(new_models), 'linear', sep = "_")
fits <- c(fits, new_models)
# Display names of the three literature ("established") RMR equations
# implemented below.
established_models <- c(`Harris-Benedict` = 'Harris-Benedict', WHO = 'WHO', `Kleiber` = 'Kleiber')
# Harris-Benedict RMR estimate.
# weight in kg, height in cm, age in years; sex_female is a logical vector.
# The kcal equations are converted to kJ via the global `conversion_factor`.
HarrisBenedict <- function(weight, height, age, sex_female){
  kcal_female <- 655.1 + 9.6 * weight + 1.8 * height - 4.7 * age
  kcal_male <- 66.47 + 13.7 * weight + 5 * height - 6.8 * age
  ifelse(sex_female, kcal_female, kcal_male) * conversion_factor
}
# Kleiber's allometric RMR estimate: metabolic rate scales with the
# 3/4 power of body mass (weight in kg).
Kleiber <- function(weight){
  weight^0.75 * 283
}
# WHO/FAO age- and sex-specific linear RMR equations (kcal), converted to kJ
# via the global `conversion_factor`.
# weight in kg, age in years, sex_female logical; inputs are matched
# elementwise.
WHO <- function(weight, age, sex_female) {
  # BUGFIX: initialize with NA instead of 0 so ages covered by no bracket
  # below (notably 10 < age < 18, for which no equation is implemented)
  # surface as NA rather than being silently reported as 0 kcal.
  RMR <- rep(NA_real_, length(weight))
  # Reusable age-bracket masks. The 18-30 equations are applied only from
  # age 18, even though the corresponding WHO bracket is nominally 10-30
  # (filter renamed from the misleading `age_10_30`).
  age_filter <- list(
    age_3_10 = age <= 10,
    age_18_30 = age >= 18 & age <= 30,
    age_30_60 = age > 30 & age <= 60,
    age_60_plus = age > 60
  )
  # Calculate RMR for each age group using `ifelse` conditioned on `sex_female`
  RMR[age_filter$age_3_10] <- ifelse(sex_female[age_filter$age_3_10],
                                     (22.5 * weight[age_filter$age_3_10] + 499),
                                     (22.7 * weight[age_filter$age_3_10] + 495))
  RMR[age_filter$age_18_30] <- ifelse(sex_female[age_filter$age_18_30],
                                      (14.7 * weight[age_filter$age_18_30] + 496),
                                      (15.3 * weight[age_filter$age_18_30] + 679))
  RMR[age_filter$age_30_60] <- ifelse(sex_female[age_filter$age_30_60],
                                      (8.7 * weight[age_filter$age_30_60] + 829),
                                      (11.6 * weight[age_filter$age_30_60] + 879))
  RMR[age_filter$age_60_plus] <- ifelse(sex_female[age_filter$age_60_plus],
                                        (10.5 * weight[age_filter$age_60_plus] + 596),
                                        (13.5 * weight[age_filter$age_60_plus] + 487))
  # kcal -> kJ
  return(RMR * conversion_factor)
}
# Models highlighted in figures/tables: the two lasso fits plus the three
# established equations, mapped to display names.
interesting_models <- c(`Lasso Enable` = "general_lasso",
`Lasso Accessible` = "generalAccessPred_lasso",
established_models
)
# Swap names and values
interesting_modelmap <- setNames(names(interesting_models), interesting_models)
repeated_cv_result_dt[wflow_id %in% interesting_models, wflow_id := interesting_modelmap[wflow_id]]
# get the fits for the interesting models
# (the established equations are not in `fits`, so indexing yields NULLs)
interesting_fits <- fits[interesting_models]
# remove NULLs
interesting_fits <- interesting_fits[!sapply(interesting_fits, is.null)]
# non-zero coefficients per interesting fit, sorted by |estimate|
interesting_model_estimates <- data.table::rbindlist(lapply(interesting_fits, function(fit) fit |> extract_fit_parsnip() |> tidy() |> subset(estimate != 0) |> mutate(abs_estimate = abs(estimate)) |> arrange(desc(abs_estimate)) |> select(-abs_estimate)), idcol = 'model', fill = TRUE)
interesting_model_estimates[, model := interesting_modelmap[model]]
# strip backticks from quoted column names
interesting_model_estimates[, term := gsub(pattern = '`', replacement = '', x = term, fixed = TRUE)]
# per model: terms ordered by decreasing absolute coefficient
print(interesting_model_estimates[term != '(Intercept)', paste(.SD[order(-abs(estimate)), term], collapse = "; "), by = model])
## model V1
## <char> <char>
## 1: Lasso Enable fT3; FFM; MCHC; Mean Outdoor Temperature; Weight; GFR
## 2: Lasso Accessible FFM; Weight; Age
# metrics shown in figures, with display names and rounding precision
interesting_metrics <- c(RMSE = 'rmse', R2 = 'rsq_trad')
rounding_by <- c(
RMSE = 0,
R2 = 3
)
interesting_metricmap <- setNames(names(interesting_metrics), interesting_metrics)
# Predict `new_data` with every fitted workflow in `fits`.
#
# fits: named list of last_fit results; each name's prefix before the first
#   "_" selects the matching recipe in `recipe_list`.
# recipe_list: named list of (untrained) recipes used for preprocessing.
# truth: observed response values for the rows of `new_data`.
# new_data: data to predict; NULL bakes the recipe's own training template.
#
# Returns one long table with columns model / estimate / truth.
# NOTE(review): the recipe is re-prepped here on its training template and
# then used to bake `new_data`; this assumes that reproduces exactly the
# preprocessing inside the fitted workflow — TODO confirm.
predict_data_fits <- function(fits, recipe_list, truth, new_data = NULL) {
data.table::rbindlist(
sapply(names(fits), function(this_wflow_id) {
last_fit <- fits[[this_wflow_id]]
this_recipe_name <- sub(pattern = '_.*$', replacement = '', this_wflow_id)
this_recipe <- recipe_list[[this_recipe_name]]
prepped_data <- this_recipe |> step_select(all_of(c(all_predictors(), all_outcomes()))) |> prep() |> bake(new_data = new_data)
last_fit |> extract_fit_parsnip() |> predict(new_data = prepped_data) |> mutate(truth = truth)
}, simplify = FALSE),
idcol = 'model'
) |> rename(estimate = .pred)
}
# Predictions of the three established RMR equations for `new_data`,
# stacked into one long table with columns model / estimate / truth.
# Column names for weight/height/age/sex are taken from the *_predictor
# globals; "weiblich" marks female participants.
predict_data_established <- function(new_data, truth){
  body_weight <- new_data[[weight_predictor]]
  is_female <- new_data[[sex_predictor]] == 'weiblich'
  per_model <- list(
    data.table::data.table(
      model = established_models['Harris-Benedict'],
      estimate = HarrisBenedict(weight = body_weight,
                                height = new_data[[height_predictor]],
                                age = new_data[[age_predictor]],
                                is_female),
      truth = truth),
    data.table::data.table(
      model = established_models['Kleiber'],
      estimate = Kleiber(weight = body_weight),
      truth = truth),
    data.table::data.table(
      model = established_models['WHO'],
      estimate = WHO(weight = body_weight,
                     age = new_data[[age_predictor]],
                     sex_female = is_female),
      truth = truth)
  )
  data.table::rbindlist(per_model)
}
# Compute `my_metrics` per model on a long prediction table and optionally
# append bootstrap replicates of every metric.
#
# preds: data.table with columns model / truth / estimate.
# my_metrics: a yardstick metric_set.
# metrics_to_keep: currently unused in the live code path (only referenced
#   by the commented-out formatting code below) — TODO confirm before removal.
# n_bootstrap_samples: number of bootstrap resamples (0 disables them).
# seed: RNG seed so the bootstrap indices are reproducible.
#
# Returns the point-estimate metrics row-bound with the bootstrap metrics;
# bootstrap rows carry a `bootstrap_ind` id (NA for point estimates).
compute_my_metrics <- function(preds, my_metrics, metrics_to_keep, n_bootstrap_samples = 10000, seed = 4735684){
metric_dt <- preds[, my_metrics(.SD, truth = truth, estimate = estimate), by=model]
if (n_bootstrap_samples > 0) {
set.seed(seed)
# # data.table only solution is too slow
# # Get sizes per model
# model_sizes <- preds[ , .N, by = model]
# # Create bootstrap sample indices
# boot_dt <- model_sizes[ , .(
# bootstrap_ind = rep(1:n_bootstrap_samples, each = N),
# row_id = sample(seq.int(N), size = N * n_bootstrap_samples, replace = TRUE)
# ), by = model]
# # Now compute metrics per bootstrap sample
# bootstrap_metrics <- boot_dt[preds, on = .(model, row_id)][ , my_metrics(.SD, truth = truth, estimate = estimate), by = .(bootstrap_ind, model)]
# NOTE(review): the same index vector is reused for every model, which
# assumes all models have the same number of predictions (unique(N) must
# be a single value) — consider asserting that.
bootstrap_indices <- replicate(n_bootstrap_samples, sample(preds[ , .N, by = model][, unique(N)], replace = TRUE), simplify = FALSE)
bootstrap_metrics <- data.table::rbindlist(pbmcapply::pbmclapply(bootstrap_indices, function(indices) {
preds[ , my_metrics(.SD[indices], truth = truth, estimate = estimate), by=model]
}, mc.cores = min(n_bootstrap_samples/10, parallel::detectCores() / 2)), idcol = 'bootstrap_ind')
metric_dt <- rbind(metric_dt, bootstrap_metrics, use.names = TRUE, fill = TRUE)
# metric_int <- bootstrap_metrics[, .(lower = quantile(.estimate, 0 + ((1 - .9)/2)), mean = mean(.estimate), higher = quantile(.estimate, 1 - ((1 - .9)/2))), by = .(model, .metric)]
# metric_dt[metric_int, on = .NATURAL, c("lower", "mean", "higher") := .(lower, mean, higher)]
}
# metric_dt <- data.table::dcast(metric_dt, model ~ .metric, value.var = '.estimate')
# data.table::setnames(metric_dt, old = metrics_to_keep, new = names(metrics_to_keep))
# metric_dt[, metrics_string := do.call(
# paste,
# c(Map(function(name, value) paste(name, round(value, rounding_by[name]), sep = ": "), names(metrics_to_keep), mget(names(metrics_to_keep))), sep = "\n ")
# )]
metric_dt
}
# --- Per-site model evaluation ----------------------------------------------
# Split the full dataset by study site so every fitted model can be scored on
# each site separately.
data_by_site <- split(all_data, all_data$Site)
# Extra evaluation set: a copy of Kiel with the temperature predictor blanked
# to NA, to gauge how much the models rely on outdoor temperature.
data_by_site$KielWoTemp <- data.table::copy(data_by_site$Kiel)
data_by_site$KielWoTemp[, (temperature_predictor) := NA]
# BMI per subject (Height is in cm, hence /100); response_check carries the
# response variable so row alignment with the predictions can be verified below.
bmi_dt <- all_data[, .(Site, response_check=get(response_variable), BMI = Weight/((Height/100)^2))]
# For each site (incl. KielWoTemp): predict with the tuned fits plus the
# established reference models, then attach BMI for later stratification.
preds_by_site <- sapply(names(data_by_site), function(this_data_name){
this_data <- data_by_site[[this_data_name]]
fit_preds <- predict_data_fits(fits = fits,
recipe_list = recipe_list,
truth = this_data[[response_variable]],
new_data = this_data)
# Append predictions of the established (published) equations for comparison.
fit_preds <- rbind(
fit_preds,
predict_data_established(new_data = this_data, truth = this_data[[response_variable]])
)
# match the bmi to the predictions
# startsWith() maps the pseudo-site "KielWoTemp" back to the real site "Kiel".
bmi_site <- bmi_dt[, unique(Site)[sapply(as.character(unique(Site)), function(site) startsWith(this_data_name, site))]]
# Recycle the site's BMI/response once per model block; relies on each model's
# predictions being in the same row order as bmi_dt for that site.
fit_preds[, c("BMI", "response_check") := bmi_dt[Site == bmi_site, .(BMI, response_check)], by = 'model']
# Guard: the recycled response must match the prediction truth exactly.
stopifnot(fit_preds[, identical(truth, response_check)])
fit_preds[, response_check := NULL]
fit_preds
}, simplify = FALSE)
# Metrics per site (point estimate plus bootstrap replicates, distinguished by
# a bootstrap_ind column — used downstream; confirm in compute_my_metrics).
metrics_by_site <- lapply(preds_by_site, compute_my_metrics, my_metrics = my_metrics, metrics_to_keep = interesting_metrics)
# Stack per-site results, keeping the site name as an id column.
pred_dt <- data.table::rbindlist(preds_by_site, idcol = 'Site')
metric_dt <- data.table::rbindlist(metrics_by_site, idcol = 'Site')
# Rename metric ids to their display names (e.g. "rmse" -> "RMSE").
metric_dt[.metric %in% interesting_metrics, .metric := interesting_metricmap[.metric]]
# Fix site order for plotting; KielWoTemp goes last.
pred_dt[, Site := factor(Site, levels = c(all_data[, levels(Site)], "KielWoTemp"))]
metric_dt[, Site := factor(Site, levels = c(all_data[, levels(Site)], "KielWoTemp"))]
# Map raw model ids to display names.
pred_dt[model %in% interesting_models, model := interesting_modelmap[model]]
metric_dt[model %in% interesting_models, model := interesting_modelmap[model]]
# sort pred_dt and metric_dt
# Models of interest first (in interesting_modelmap order), remaining models after.
pred_dt[, model := factor(model, levels = c(interesting_modelmap, unique(model[!model %in% interesting_modelmap])))]
metric_dt[, model := factor(model, levels = c(interesting_modelmap, unique(model[!model %in% interesting_modelmap])))]
# Coarse model-type labels used for coloring the prediction plots.
pred_dt[grep("lasso", model, ignore.case = TRUE), `Model Type` := "Lasso"]
pred_dt[model %in% established_models, `Model Type` := "Established"]
# Fixed model-type color scheme shared across all plots below
# (hex values match the ColorBrewer "Set2" palette).
model_type_colors <- c(Established = "#66C2A5",
Lasso = "#FC8D62",
Nnet = "#8DA0CB",
RF = "#E78AC3",
Linear = "#A6D854")
# Derive preprocessing recipe and model type from the workflow id; assumes the
# id format "<prepping>_<model type>" — TODO confirm upstream naming.
repeated_cv_result_dt[, c("prepping", "Model Type") := data.table::tstrsplit(as.character(wflow_id2), "_", fixed = TRUE)]
repeated_cv_result_dt[, `Model Type` := tools::toTitleCase(`Model Type`)]
# Workflows whose recipe name contains "Access" use only easily accessible predictors.
repeated_cv_result_dt[, `Accessible Predictors` := grepl(pattern = "Access", x = prepping)]
# Cross-validation summary: mean +/- std_err of the main metric per workflow,
# ordered by mean; the refit models in new_models are excluded.
ggplot(repeated_cv_result_dt[!wflow_id2 %in% names(new_models) & .metric == main_metric], aes(x = reorder(wflow_id, mean), y = mean, color = `Model Type`, shape = `Accessible Predictors`)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), width = .5) +
scale_shape_manual(values = c(5, 16)) +
scale_color_manual(values = model_type_colors) +
# Legend placed inside the panel (top-left); x tick labels hidden because the
# workflow ids are long and the ordering is what matters.
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
legend.position = c(.01, .99),
legend.justification = c("left", "top"),
legend.box.background = element_rect(linewidth = .1),
legend.background = element_rect(fill = NA))+
labs(x = NULL, y = interesting_metricmap[main_metric], title = "Cross-Validation Summary")
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Saves the last plot printed above.
ggsave(filename = file.path(fig_dir, "cv_summary.pdf"), width = 5, height = 4)
# tidytext provides reorder_within()/scale_x_reordered() for per-facet ordering.
require(tidytext)
## Loading required package: tidytext
# --- Bootstrap significance tests between models ----------------------------
summary_metrics <- data.table::copy(metric_dt)
summary_metrics[Site == "KielWoTemp", Site := "Kiel Without Temperature"]
data.table::setnames(summary_metrics, c(".metric", ".estimate"), c("metric_name", "metric"))
# Model-type labels for fill colors (Linear/Lasso by name pattern,
# Established by membership).
summary_metrics[grep("linear", model, ignore.case = TRUE), `Model Type` := "Linear"]
summary_metrics[grep("lasso", model, ignore.case = TRUE), `Model Type` := "Lasso"]
summary_metrics[model %in% established_models, `Model Type` := "Established"]
# Wide layout: one column per model; rows are Site x metric x bootstrap
# replicate (bootstrap_ind is NA for the full-data point estimate).
wide_bootstrap_res <- data.table::dcast(summary_metrics, formula = Site + metric_name + bootstrap_ind ~ model, value.var = "metric")
# All unordered pairs among the models of interest (model1 < model2).
pairwise_comparisons <- data.table::CJ(model1 = ordered(interesting_modelmap, levels = interesting_modelmap),
model2 = ordered(interesting_modelmap, levels = interesting_modelmap))[model1 < model2]
# Bootstrap hypothesis test per Site x metric and model pair: center the
# bootstrap differences at their own mean, then take the two-sided p-value as
# the share of |centered bootstrap differences| >= |observed difference|.
pairwise_tests <- wide_bootstrap_res[,
.(list = lapply(1:nrow(pairwise_comparisons), function(i){
m1 <- as.character(pairwise_comparisons[i, model1])
m2 <- as.character(pairwise_comparisons[i, model2])
delta <- get(m1) - get(m2)
# Observed difference
delta_obs <- delta[is.na(bootstrap_ind)]
# Bootstrap differences centered at their bootstrap mean
delta_star <- delta[!is.na(bootstrap_ind)]
delta_star_centered <- delta_star - mean(delta_star)
# p-value: proportion of bootstrap differences >= observed difference
p_value <- mean(abs(delta_star_centered) >= abs(delta_obs))
# One result row in the column layout expected by stat_pvalue_manual().
.(.y. = 'metric',
p = p_value,
mean_delta = mean(delta),
group1 = m1,
group2 = m2)
})), by = .(Site, metric_name)][, data.table::rbindlist(list), by = .(Site, metric_name)]
# interesting_comparisons <- data.table(group1 = interesting_modelmap[-length(interesting_modelmap)], group2 = interesting_modelmap[-1])
# Suffix the group labels with the site name; presumably this matches the
# "label___group" encoding used by tidytext::reorder_within() — TODO confirm.
pairwise_tests[, group1 := paste(group1, Site, sep = "___")]
pairwise_tests[, group2 := paste(group2, Site, sep = "___")]
pairwise_tests_summary <- pairwise_tests[metric_name %in% interesting_metricmap &
Site %in% c("Freising", "Nuremberg", "Kiel")]
# Benjamini-Hochberg correction within each Site x metric panel.
pairwise_tests_summary[, p.adj := p.adjust(p, method = "BH"), by = .(Site, metric_name)]
# Significance stars from the adjusted p-values.
pairwise_tests_summary[, p.signif := as.character(stats::symnum(x = p.adj,
cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1),
symbols = c("****", "***", "**", "*", "ns")))]
score_summary_metrics <- summary_metrics[model %in% interesting_modelmap &
metric_name %in% interesting_metricmap &
Site %in% c("Freising", "Nuremberg", "Kiel")]
# Place significance brackets at the highest bootstrap value per panel,
# stacking each subsequent bracket 5% higher.
pairwise_tests_summary[score_summary_metrics[!is.na(bootstrap_ind),
.(y.position = max(metric)),
by = .(Site, metric_name)],
on = .NATURAL,
y.position := y.position]
pairwise_tests_summary[, y.position := y.position + y.position * (.05 * (seq.int(.N) - 1)), by = .(Site, metric_name)]
# Manual tweak: lower the RMSE brackets (RMSE is on a kJ scale, so the 5%
# stacking overshoots the panel otherwise).
pairwise_tests_summary[metric_name == "RMSE", y.position := y.position - 200]
# Score summary per site: bars show the full-data point estimate, the
# pointrange shows the bootstrap mean with a 90% percentile interval, and
# brackets show pairwise bootstrap-test significance.
score_summary <- ggplot(score_summary_metrics[!is.na(bootstrap_ind)],
aes(x = tidytext::reorder_within(model, metric, Site), y = metric)) +
geom_col(data = score_summary_metrics[is.na(bootstrap_ind)], aes(fill = `Model Type`), color = 'white', alpha = 0.7) +
# 90% interval: quantiles at 0.05 and 0.95 of the bootstrap replicates.
stat_summary(fun.data = function(x) list(y = mean(x), ymin = quantile(x, 0 + ((1 - .9)/2)), ymax = quantile(x, 1 - ((1 - .9)/2))),
geom = "pointrange", fatten = 2) +
stat_pvalue_manual(pairwise_tests_summary,
label = "p.signif", tip.length = 0, size = 2) +
scale_fill_manual(values = model_type_colors) +
coord_cartesian(ylim = c(0, NA)) +
# add the value as text into the bar, ninety degrees rotated
geom_text(data = score_summary_metrics[is.na(bootstrap_ind)], aes(label = round(metric, rounding_by[metric_name]), y = 0), hjust = -.15, vjust = 1.4, angle = 90) +
facet_grid(metric_name ~ Site, scales = "free") +
labs(x = NULL, y = NULL) +
# Strips the "___Site" suffix added by reorder_within() from the x labels.
tidytext::scale_x_reordered() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "left")
ggsave(filename = file.path(fig_dir, "score_summary.pdf"), plot = score_summary, width = 8, height = 6)
# Same summary, restricted to Kiel vs. Kiel-without-temperature and to the
# non-established models (the established equations do not use temperature).
score_summary_metrics_kiel <- summary_metrics[model %in% interesting_modelmap[!interesting_modelmap %in% established_models] &
metric_name %in% interesting_metricmap &
Site %in% c("Kiel", "Kiel Without Temperature")]
# Filter the pairwise tests to the same model subset; group1/group2 already
# carry the "___Site" suffix, hence the pattern match instead of %in%.
pairwise_tests_summary_kiel <-
pairwise_tests[grepl(pattern = paste(interesting_modelmap[!interesting_modelmap %in% established_models], collapse = "|"), x = group1) &
grepl(pattern = paste(interesting_modelmap[!interesting_modelmap %in% established_models], collapse = "|"), x = group2) &
metric_name %in% interesting_metricmap &
Site %in% c("Kiel", "Kiel Without Temperature")]
# BH-adjust and star within each Site x metric group.
pairwise_tests_summary_kiel[, p.signif := as.character(stats::symnum(x = p.adjust(p, method = "BH"),
cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1),
symbols = c("****", "***", "**", "*", "ns"))),
by = .(Site, metric_name)]
# Bracket placement: same max-plus-5%-stacking scheme as above.
pairwise_tests_summary_kiel[score_summary_metrics_kiel[!is.na(bootstrap_ind),
.(y.position = max(metric)),
by = .(Site, metric_name)],
on = .NATURAL,
y.position := y.position]
pairwise_tests_summary_kiel[, y.position := y.position + y.position * (.05 * (seq.int(.N) - 1)), by = .(Site, metric_name)]
score_summary_kiel <- ggplot(score_summary_metrics_kiel[!is.na(bootstrap_ind)],
aes(x = tidytext::reorder_within(model, metric, Site), y = metric)) +
geom_col(data = score_summary_metrics_kiel[is.na(bootstrap_ind)], aes(fill = `Model Type`), color = 'white', alpha = 0.7) +
stat_summary(fun.data = function(x) list(y = mean(x), ymin = quantile(x, 0 + ((1 - .9)/2)), ymax = quantile(x, 1 - ((1 - .9)/2))),
geom = "pointrange", fatten = 2) +
stat_pvalue_manual(pairwise_tests_summary_kiel, label = "p.signif", tip.length = 0, size = 2) +
scale_fill_manual(values = model_type_colors) +
# add the value as text into the bar, ninety degrees rotated
geom_text(data = score_summary_metrics_kiel[is.na(bootstrap_ind)], aes(label = round(metric, rounding_by[metric_name]), y = 0), hjust = -.15, angle = 90) +
facet_grid(metric_name ~ Site, scales = "free") +
labs(x = NULL, y = NULL) +
tidytext::scale_x_reordered() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
ggsave(filename = file.path(fig_dir, "score_summary_kiel.pdf"), plot = score_summary_kiel, width = 4, height = 4)
#' Scatter plot of predicted vs. observed RMR.
#'
#' @param preds data.table of predictions with columns `truth`, `estimate`,
#'   and the columns named by `facet_col`, `facet_row`, `color_col`.
#' @param metric_dt data.table with a `metrics_string` column (one row per
#'   facet combination), drawn as a label in the top-left of each panel.
#' @param main_metric Currently unused; kept for interface compatibility.
#' @param facet_col,facet_row Column names for `facet_grid()`; pass `""` for
#'   either to disable faceting.
#' @param color_col Column mapped to point color: discrete columns get the
#'   shared `model_type_colors` scale, numeric ones a diverging gradient.
#' @param focus_observed If TRUE, clip both axes to the range of `truth`
#'   (zooms in on the observed range instead of the union with predictions).
#' @return A ggplot object (identity line, points, per-panel metric labels,
#'   equal-scale axes via `coord_obs_pred()`).
plot_preds <- function(preds, metric_dt, main_metric, facet_col = "Site", facet_row = "model", color_col = "Model Type", focus_observed = FALSE){
p <- ggplot(preds, aes(x = truth, y = estimate, color = get(color_col))) +
geom_abline(color = "gray50", lty = 2) +
geom_point(alpha = 0.5) +
labs(title = NULL, x = 'Observed RMR in kJ/day', y = 'Predicted RMR in kJ/day') +
theme(legend.position = "none") +
geom_label(data = metric_dt, aes(x = -Inf, y = Inf, label = metrics_string),
vjust = 1.1, hjust = -0.1, inherit.aes = FALSE, size = 3)
# Discrete color column -> fixed manual palette; numeric -> diverging gradient.
if (preds[, !is.numeric(get(color_col))])
p <- p + scale_color_manual(values = model_type_colors)
else
p <- p + scale_color_gradient2()
# Scalar condition: use short-circuiting && (not the vectorized &).
if (facet_col != "" && facet_row != "") {
p <- p + facet_grid(get(facet_col) ~ get(facet_row))
}
if (focus_observed) {
p <- p +
coord_obs_pred(xlim=c(min(preds$truth), max(preds$truth)),
ylim=c(min(preds$truth), max(preds$truth)))
} else {
p <- p +
coord_obs_pred()
}
p
}
#' Bland-Altman plot of observed vs. predicted RMR.
#'
#' Plots the difference (truth - estimate) against the mean of both, with the
#' mean difference and 1.96-SD limits of agreement as horizontal lines,
#' computed per facet cell when faceting is enabled.
#'
#' NOTE(review): `preds` is modified by reference — the helper columns
#' (mean_measurement, difference, mean_difference, lower_difference,
#' upper_difference) are added to the caller's data.table. Callers here pass
#' freshly subsetted tables, so nothing upstream is clobbered, but pass a
#' `data.table::copy()` if that side effect is unwanted.
#'
#' @param preds data.table with columns `truth`, `estimate`, and the columns
#'   named by `facet_col`, `facet_row`, `color_col`.
#' @param metric_dt Unused here; kept for interface symmetry with plot_preds().
#' @param main_metric Unused here; kept for interface compatibility.
#' @param facet_col,facet_row Column names for `facet_grid()`; pass `""` for
#'   either to disable faceting (limits are then computed over all rows).
#' @param color_col Column mapped to point color; discrete columns get the
#'   shared `model_type_colors` scale.
#' @return A ggplot object.
plot_bland_altman <- function(preds, metric_dt, main_metric, facet_col = "Site", facet_row = "model", color_col = "Model Type") {
preds[, mean_measurement := (truth + estimate) / 2]
preds[, difference := truth - estimate]
# Scalar condition: use short-circuiting && (not the vectorized &).
if (facet_col != "" && facet_row != "") {
# Limits of agreement per facet cell (mean +/- 1.96 SD of the differences).
preds[, mean_difference := mean(difference), by=.(get(facet_row), get(facet_col))]
preds[, lower_difference:= mean_difference - sd(difference)*1.96, by=.(get(facet_row), get(facet_col))]
preds[, upper_difference:= mean_difference + sd(difference)*1.96, by=.(get(facet_row), get(facet_col))]
} else {
preds[, mean_difference := mean(difference)]
preds[, lower_difference:= mean_difference - sd(difference)*1.96]
preds[, upper_difference:= mean_difference + sd(difference)*1.96]
}
p <- ggplot(preds, aes(x = mean_measurement, y = difference, color = get(color_col))) +
geom_hline(aes(yintercept = mean_difference)) +
geom_hline(aes(yintercept = lower_difference), linetype = 'dashed') +
geom_hline(aes(yintercept = upper_difference), linetype = 'dashed') +
geom_point(alpha = 0.5) +
labs(title = NULL, x = 'Mean of Observed and Predicted', y = 'Observed - Predicted') +
theme(legend.position = "none")
if (preds[, !is.numeric(get(color_col))])
p <- p + scale_color_manual(values = model_type_colors)
if (facet_col != "" && facet_row != "") {
p <- p + facet_grid(get(facet_col) ~ get(facet_row))
}
p
}
# --- Bland-Altman plot colored by BMI category ------------------------------
sites <- c("Freising", "Nuremberg", "Kiel", "KielWoTemp")
# Only the Lasso variants and the established equations are shown.
models <- c(interesting_modelmap[startsWith(interesting_modelmap, "Lasso")],
established_models)
# WHO-style BMI classes; right = FALSE gives half-open intervals [lower, upper).
pred_dt[, `BMI Category` := cut(BMI, breaks = c(0, 16, 17, 18.5, 25, 30, 35, 40, Inf), right = FALSE)]
pred_dt[, `BMI Category` := factor(`BMI Category`, levels = c("[0,16)", "[16,17)", "[17,18.5)", "[18.5,25)", "[25,30)", "[30,35)", "[35,40)", "[40,Inf)"))]
# Interval label -> human-readable class name.
bmi_mapping <- c(`[0,16)` = "Underweight (Severe thinness)",
`[16,17)` = "Underweight (Moderate thinness)",
`[17,18.5)` = "Underweight (Mild thinness)",
`[18.5,25)` = "Normal range",
`[25,30)` = "Overweight (Pre-obese)",
`[30,35)` = "Obese (Class I)",
`[35,40)` = "Obese (Class II)",
`[40,Inf)` = "Obese (Class III)")
# NOTE(review): indexing bmi_mapping by a factor uses integer level codes,
# not label matching; this is correct only because the factor levels set above
# are in exactly the same order as names(bmi_mapping).
pred_dt[, `BMI Category` := bmi_mapping[`BMI Category`]]
pred_dt[, `BMI Category` := ordered(`BMI Category`, levels = bmi_mapping)]
# Hand-picked category colors (kept in sync with bmi_mapping order); not used
# in the printed plot below, which applies a brewer palette instead.
bmi_colors <- c("#7c7cbd", "#7c7cfd", "#7cfcfc", "#7cfc7c", "#fcfc7c", "#fcbb91", "#fc9192", "#c08081")
names(bmi_colors) <- bmi_mapping
# One "Metric: value" line per metric, joined per model x site, for the
# in-panel labels of the prediction plots (point estimates only).
metric_string_dt <- data.table::dcast(metric_dt[is.na(bootstrap_ind)], model + Site ~ .metric, value.var = '.estimate')[, metrics_string := do.call(
paste,
c(Map(function(name, value) paste(name, round(value, rounding_by[name]), sep = ": "), interesting_metricmap, mget(interesting_metricmap)), sep = "\n ")
)]
pred_obs <-
plot_bland_altman(
pred_dt[model %in% models &
Site %in% sites],
metric_string_dt[model %in% models &
Site %in% sites],
main_metric = interesting_metricmap[main_metric],
facet_col = "Site",
facet_row = "model",
color_col = "BMI Category"
)
# print(pred_obs +
# geom_point(alpha = 1, size = .5) +
# guides(color = guide_legend(byrow = T)) +
# scale_color_manual(values = bmi_colors) +
# theme(legend.position = "top") + labs(color = "BMI Category"))
# Recolor with an RdYlBu brewer palette (replaces the scale set inside
# plot_bland_altman, hence the "scale already present" message below).
print(pred_obs +
geom_point(alpha = 1, size = .4) +
guides(color = guide_legend(byrow = T)) +
scale_color_brewer(palette = "RdYlBu", direction = -1) +
theme(legend.position = "top") + labs(color = "BMI Category"))
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
# print(pred_obs +
# geom_point(alpha = 1, size = .5) +
# guides(color = guide_legend(byrow = T)) +
# scale_color_brewer(type = "div", direction = -1) +
# theme(legend.position = "top") + labs(color = "BMI Category"))
library(patchwork)
# Build prediction-vs-observation and Bland-Altman figures for two
# configurations in parallel: (1) all sites x Lasso+established models at
# 15x15 in, and (2) Freising with only "Lasso Enable" at 4x4 in.
pred_plots <-
mapply(
sites = list(c("Freising", "Nuremberg", "Kiel", "KielWoTemp"),
c("Freising")),
models = list(
c(interesting_modelmap[startsWith(interesting_modelmap, "Lasso")],
established_models
),
c("Lasso Enable")),
fig.sizes = list(c(15, 15),
c(4, 4)),
FUN = function(sites, models, fig.sizes) {
# File-name suffix encodes the selected sites and models.
fig_suffix <- paste0(paste(sites, collapse="-"), "_", paste(models, collapse="-"), ".pdf")
# Faceting is only enabled along dimensions with more than one level.
pred_obs <-
plot_preds(
pred_dt[model %in% models &
Site %in% sites],
metric_string_dt[model %in% models &
Site %in% sites],
main_metric = interesting_metricmap[main_metric],
facet_col = ifelse(length(sites) > 1, "Site", ""),
facet_row = ifelse(length(models) > 1, "model", ""),
focus_observed = TRUE
)
print(pred_obs)
ggsave(filename = file.path(fig_dir, paste0("pred_obs_", fig_suffix)), plot = pred_obs, width = fig.sizes[1], height = fig.sizes[2])
ba <-
plot_bland_altman(
pred_dt[model %in% models &
Site %in% sites],
metric_string_dt[model %in% models &
Site %in% sites],
main_metric = interesting_metricmap[main_metric],
facet_col = ifelse(length(sites) > 1, "Site", ""),
facet_row = ifelse(length(models) > 1, "model", ""),
)
print(ba)
ggsave(filename = file.path(fig_dir, paste0("ba_", fig_suffix)), plot = ba, width = fig.sizes[1], height = fig.sizes[2])
# Side-by-side combination via patchwork's `+` operator.
ggsave(filename = file.path(fig_dir, paste0("ba_pred_obs_", fig_suffix)), plot = (pred_obs + ba), width = fig.sizes[1]*2 - 1, height = fig.sizes[2])
list(
pred_obs = pred_obs,
ba = ba
)
},
SIMPLIFY = FALSE
)
# Combine the small Freising panel with the score summary into one figure.
pred_obs_freising <- pred_plots[[2]]$pred_obs
pred_obs_score_summary <- ggarrange(pred_obs_freising, score_summary, widths = c(4,8), labels = c("A", "B"))
pred_obs_score_summary
ggsave(file = file.path(fig_dir, "pred_obs_score_summary.pdf"), plot = pred_obs_score_summary, width = 12, height = 6)
require(ggrepel)
## Loading required package: ggrepel
# --- Explained variance of the selected linear model ------------------------
# Extract the underlying lm fits from the refit workflows; the commented
# print would show only coefficients significant at 0.05.
selected_lm_fits <- sapply(new_models,
function(model) {
lm_fit <- model |> extract_fit_engine()
coef_mat <- lm_fit |> summary() |> coefficients()
# coef_mat[coef_mat[, 'Pr(>|t|)'] < 0.05, , drop = FALSE] |> print()
lm_fit
}, simplify = FALSE)
# Relative importance metrics: lmg and pmvd (pmvd is plotted below).
feature_imps <- c("lmg", "pmvd")
linear_imp <- relaimpo::calc.relimp(selected_lm_fits$selected_linear, type = feature_imps)
# Per-feature explained variance plus two extra slices: a fixed
# intra-individual variation of 0.043 (presumably a literature value — TODO
# confirm source) and an "Unexplained" remainder filled in next.
expl_variance <- data.table::setDT(enframe(c(linear_imp$pmvd, `Intra-Individual Variation` = 0.043, Unexplained = 0), 'Feature', 'Explained Variance'))
# Remainder so all slices sum to 1 (the placeholder 0 does not affect the sum).
expl_variance[Feature == 'Unexplained', `Explained Variance` := 1 - sum(expl_variance$`Explained Variance`)]
# Order: features by decreasing importance, then Unexplained, then
# Intra-Individual Variation.
expl_variance[, Feature:=factor(Feature, levels=c(names(sort(linear_imp$pmvd, decreasing = TRUE)), 'Unexplained', 'Intra-Individual Variation'))]
data.table::setkey(expl_variance, Feature)
# Cumulative position and slice midpoints for label placement in the
# stacked (filled) bar.
expl_variance[, cum_var := cumsum(`Explained Variance`)]
expl_variance[, x_space := cum_var - `Explained Variance`/2]
expl_variance_plot <- ggplot(expl_variance, aes(x = `Explained Variance`, y = 'Feature', fill = reorder(Feature, -x_space))) +
geom_col(position = 'fill') +
scale_fill_manual(values = feature_colors, guide = guide_legend(reverse = TRUE, nrow = 1)) +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.position = 'top') +
# Percentage labels at the slice midpoints, repelled vertically to avoid
# overlap; label/segment colors follow the per-feature text colors.
ggrepel::geom_label_repel(aes(label = round(`Explained Variance` * 100, 1),
x = x_space),
max.overlaps = 10,
min.segment.length = 0,
force = 10,
direction = 'y',
show.legend = FALSE,
color = text_colors[expl_variance[, levels(Feature)]],
segment.color = text_colors[expl_variance[, levels(Feature)]]) +
labs(y = NULL, fill = NULL)
ggsave(filename = file.path(fig_dir, "selected_features_expl_variance.pdf"), plot = expl_variance_plot, width = 8.5, height = 3)
# --- Temperature vs. RMR scatter with marginal histograms -------------------
scatter <- ggscatterhist(all_data, x = "Mean Outdoor Temperature", y = "RMR", color = "Site", xlab = "Mean Daily Temperature in °C", ylab = "RMR in kJ", alpha=.5, print = FALSE, legend = "bottom")
# Add per-site regression lines and correlation annotations to the scatter panel.
scatter$sp <- scatter$sp + geom_smooth(method = 'lm', se = FALSE, aes(color = Site)) + stat_cor(size = 3, aes(color = Site))
pdf(file.path(fig_dir, "scatter_temp_rmr.pdf"), width = 7, height = 7)
scatter
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
dev.off()
## png
## 2
print(scatter)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Rebuild the ggscatterhist composite manually (via ggpubr internals) so the
# result is a single grob that ggarrange() can place next to the Kiel scores.
scatter_kiel_scores <- ggpubr:::.insert_xaxis_grob(scatter$sp, scatter$xplot, grid::unit(1/5,
"null"), position = "top")
## `geom_smooth()` using formula = 'y ~ x'
scatter_kiel_scores <- ggpubr:::.insert_yaxis_grob(scatter_kiel_scores, scatter$yplot, grid::unit(1/5,
"null"), position = "right")
scatter_kiel_scores <- ggarrange(as_ggplot(scatter_kiel_scores), score_summary_kiel, labels = c("A", "B"))
ggsave(filename = file.path(fig_dir, "scatter_temp_rmr_kiel_scores.pdf"), plot = scatter_kiel_scores, width = 14, height = 7)
library(pheatmap)
library(seriation)
# --- Predictor/outcome correlation matrix -----------------------------------
cor_method <- "pearson"
# Bake the general recipe down to predictors + outcome and compute pairwise
# correlations (pairwise.complete handles missing values per pair).
cor_mat <-
general_recipe |> step_select(c(all_predictors(), all_outcomes())) |> prep() |> bake(NULL) |> cor(use = "pairwise", method = cor_method)
# Readable label for the dummy-coded sex column ("weiblich" = female).
colnames(cor_mat)[colnames(cor_mat) == "Sex_weiblich"] <- "Sex (f)"
rownames(cor_mat)[rownames(cor_mat) == "Sex_weiblich"] <- "Sex (f)"
# Microbiome features' correlation with RMR, sorted descending, printed as CSV.
microbiome_cor_mat <- round(cor_mat[column_groups[group %in% c("microbiome family", "microbiome diversity"), column], "RMR", drop = FALSE], 3)
microbiome_cor_mat <- microbiome_cor_mat[order(microbiome_cor_mat[, "RMR"], decreasing = TRUE), , drop=FALSE]
colnames(microbiome_cor_mat)[colnames(microbiome_cor_mat) == "RMR"] <- "RMR (kJ/day)"
write.table(microbiome_cor_mat, quote = FALSE, sep = ',')
## RMR (kJ/day)
## f__Sutterellaceae,0.16
## f__Prevotellaceae,0.135
## f__unknown_ Bacteroidetes,0.129
## f__Succinivibrionaceae,0.111
## f__Acidaminococcaceae,0.107
## f__Coriobacteriaceae,0.103
## f__Erysipelotrichaceae,0.101
## f__Enterococcaceae,0.073
## f__Veillonellaceae,0.066
## f__Lachnospiraceae,0.052
## f__Pasteurellaceae,0.037
## f__Clostridiaceae 1,0.035
## f__Comamonadaceae,0.035
## f__Rhodospirillaceae,0.033
## f__Enterobacteriaceae,0.032
## f__Fusobacteriaceae,0.017
## f__Bifidobacteriaceae,0.014
## f__Peptococcaceae 1,0.011
## f__Eubacteriaceae,-0.003
## f__Peptostreptococcaceae,-0.01
## f__Spirochaetaceae,-0.011
## f__Lactobacillaceae,-0.022
## f__unknown_ Desulfovibrionales,-0.032
## f__Methanobacteriaceae,-0.036
## f__unknown_ Rhodospirillales,-0.036
## f__Puniceicoccaceae,-0.038
## f__Bacteroidaceae,-0.042
## f__unknown_ Proteobacteria,-0.057
## f__Streptococcaceae,-0.061
## f__unknown_ Alphaproteobacteria,-0.064
## f__Victivallaceae,-0.07
## f__unknown_ Burkholderiales,-0.072
## f__unknown_ Clostridia,-0.072
## f__Elusimicrobiaceae,-0.078
## f__Desulfovibrionaceae,-0.082
## f__Verrucomicrobiaceae,-0.097
## f__unknown_ Bacteroidales,-0.1
## f__Anaeroplasmataceae,-0.104
## f__unknown_ Deltaproteobacteria,-0.104
## f__unknown_ Bacteria,-0.122
## Simpson.effective,-0.128
## f__unknown_ Firmicutes,-0.136
## Shannon.effective,-0.147
## f__unknown_ Clostridiales,-0.156
## f__Rikenellaceae,-0.174
## f__Ruminococcaceae,-0.188
## f__Porphyromonadaceae,-0.219
# Remaining (non-microbiome) features' correlation with RMR, sorted descending.
non_microbiome_cor_mat <- round(cor_mat[!rownames(cor_mat) %in% rownames(microbiome_cor_mat), "RMR", drop = FALSE], 3)
non_microbiome_cor_mat <- non_microbiome_cor_mat[order(non_microbiome_cor_mat[, "RMR"], decreasing = TRUE), , drop=FALSE]
colnames(non_microbiome_cor_mat)[colnames(non_microbiome_cor_mat) == "RMR"] <- "RMR (kJ/day)"
write.table(non_microbiome_cor_mat, quote = FALSE, sep = ',')
## RMR (kJ/day)
## RMR,1
## FFM,0.853
## Weight,0.722
## Height,0.706
## Hemoglobin,0.538
## Waist,0.493
## Hematocrit,0.484
## Uric Acid,0.471
## Visceral Fat,0.456
## MCHC,0.422
## fT3,0.404
## Hip,0.349
## GPT ALT,0.343
## Creatinine,0.334
## Ferritin,0.281
## Triglycerides,0.279
## GGT,0.251
## BP Diastolic,0.247
## Iron,0.204
## GFR,0.203
## BP Systolic,0.199
## propionic acid a,0.187
## Glucose,0.185
## FM,0.182
## GOT AST,0.171
## Insulin,0.159
## butyric acid a,0.12
## Urea,0.099
## acetic acid a,0.074
## Leukocytes,0.059
## Body Temperature,0.051
## TSH,0.043
## Heart Rate,0.035
## Isobutyrate a,0.022
## Isovalerate a,0.002
## LDH,-0.002
## 2-Methylbutyrate a,-0.005
## pentanoic acid a,-0.009
## Lactic acid a,-0.019
## 4-Methylvaleric acid a,-0.028
## LDL,-0.038
## hexanoic acid c,-0.039
## hsCRP,-0.058
## Mean Outdoor Temperature,-0.147
## Age,-0.184
## Cholesterol,-0.199
## HDL,-0.49
## Sex (f),-0.658
# --- Correlation heatmap with seriated clustering ---------------------------
# Hierarchical clustering on correlation distance (1 - r).
dist_obj = as.dist(1 - cor_mat)
clust <- hclust(dist_obj, method = "complete") # do not use "ward.D2" here, because we use correlation distance
# Optimize the dendrogram leaf order (optimal leaf ordering) for a cleaner
# block structure in the heatmap.
clust <-
seriation:::reorder.hclust(clust, dist = dist_obj, method = "OLO")
# Row annotation: predictor group per feature, title-cased for display.
data.table::setkey(column_groups, column)
anno_df <- as.data.frame(column_groups[, "group", with = FALSE], row.names = column_groups[, column])
# Keep the annotation row name in sync with the renamed sex column.
rownames(anno_df)[rownames(anno_df) == "Sex"] <- "Sex (f)"
anno_df$group <- tools::toTitleCase(as.character(anno_df$group))
# Symmetric heatmap: the same seriated clustering is used for rows and
# columns; the column dendrogram is hidden (treeheight_col = 0).
my_heatmap <- pheatmap(
cor_mat,
cluster_rows = clust,
treeheight_col = 0,
cluster_cols = clust,
annotation_row = anno_df[clust$labels[clust$order], , drop = FALSE],
# annotation_row = anno_df[clust$labels[clust$order], , drop = FALSE],
color = colorRampPalette(rev(
RColorBrewer::brewer.pal(n = 7, name = "RdBu")
))(100),
silent = TRUE
)
# Write the heatmap to PDF (pheatmap returns a gtable; draw it manually).
pdf(file.path(fig_dir, paste0(
"correlation_heatmap_", cor_method, ".pdf"
)), width = 19, height = 14)
grid::grid.newpage()
grid::grid.draw(my_heatmap$gtable)
dev.off()
## png
## 2
# Draw once more for the rendered document.
grid::grid.newpage()
grid::grid.draw(my_heatmap$gtable)